Explore cleaned data of back-calculated length-at-age of perch from gillnet-survey data from SLU databases

Author

Max Lindmark, Jan Ohlberger, Anna Gårdmark

Published

June 7, 2023

Load libraries

Code
pkgs <- c("tidyverse", "tidylog", "rnaturalearth", "rnaturalearthdata", "RColorBrewer",
          "forcats", "viridis", "broom", "rgdal", "ggmap", "sf", "ggridges") 

## minpack.lm needed if using nlsLM()
if(length(setdiff(pkgs,rownames(installed.packages()))) > 0){

    install.packages(setdiff(pkgs, rownames(installed.packages())),dependencies = T)
  
  }

invisible(lapply(pkgs, library, character.only = T))
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.2     ✔ readr     2.1.4
#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
#> ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
#> ✔ purrr     1.0.1     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#> 
#> Attaching package: 'tidylog'
#> 
#> 
#> The following objects are masked from 'package:dplyr':
#> 
#>     add_count, add_tally, anti_join, count, distinct, distinct_all,
#>     distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
#>     full_join, group_by, group_by_all, group_by_at, group_by_if,
#>     inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
#>     relocate, rename, rename_all, rename_at, rename_if, rename_with,
#>     right_join, sample_frac, sample_n, select, select_all, select_at,
#>     select_if, semi_join, slice, slice_head, slice_max, slice_min,
#>     slice_sample, slice_tail, summarise, summarise_all, summarise_at,
#>     summarise_if, summarize, summarize_all, summarize_at, summarize_if,
#>     tally, top_frac, top_n, transmute, transmute_all, transmute_at,
#>     transmute_if, ungroup
#> 
#> 
#> The following objects are masked from 'package:tidyr':
#> 
#>     drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
#>     spread, uncount
#> 
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> 
#> 
#> 
#> Attaching package: 'rnaturalearthdata'
#> 
#> 
#> The following object is masked from 'package:rnaturalearth':
#> 
#>     countries110
#> 
#> 
#> Loading required package: viridisLite
#> 
#> Loading required package: sp
#> 
#> Please note that rgdal will be retired during 2023,
#> plan transition to sf/stars/terra functions using GDAL and PROJ
#> at your earliest convenience.
#> See https://r-spatial.org/r/2022/04/12/evolution.html and https://github.com/r-spatial/evolution
#> rgdal: version: 1.6-6, (SVN revision 1201)
#> Geospatial Data Abstraction Library extensions to R successfully loaded
#> Loaded GDAL runtime: GDAL 3.4.2, released 2022/03/08
#> Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/gdal
#> GDAL binary built with GEOS: FALSE 
#> Loaded PROJ runtime: Rel. 8.2.1, January 1st, 2022, [PJ_VERSION: 821]
#> Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/proj
#> PROJ CDN enabled: FALSE
#> Linking to sp version:1.6-0
#> To mute warnings of possible GDAL/OSR exportToProj4() degradation,
#> use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
#> 
#> ℹ Google's Terms of Service: <https://mapsplatform.google.com>
#> ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
#> Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE

# devtools::install_github("seananderson/ggsidekick") ## not on CRAN 
library(ggsidekick)
theme_set(theme_sleek())

Read data

Code
d <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/perch-growth/master/data/for_analysis/dat.csv")
#> Rows: 364546 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (5): age_ring, area, gear, ID, sex
#> dbl (6): length_mm, age_bc, age_catch, catch_year, cohort, final_length
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(d)
#> Rows: 364,546
#> Columns: 11
#> $ length_mm    <dbl> 62, 115, 153, 168, 184, 51, 110, 143, 155, 163, 71, 112, …
#> $ age_bc       <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4, 5, …
#> $ age_catch    <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5, 5, …
#> $ age_ring     <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y…
#> $ area         <chr> "BT", "BT", "BT", "BT", "BT", "BT", "BT", "BT", "BT", "BT…
#> $ catch_year   <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1977, 197…
#> $ cohort       <dbl> 1972, 1972, 1972, 1972, 1972, 1972, 1972, 1972, 1972, 197…
#> $ final_length <dbl> 184, 184, 184, 184, 184, 163, 163, 163, 163, 163, 178, 17…
#> $ gear         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ ID           <chr> "1977_1_BT_male", "1977_1_BT_male", "1977_1_BT_male", "19…
#> $ sex          <chr> "male", "male", "male", "male", "male", "male", "male", "…

# Use only length-at-age by filtering on age_ring
# Since there is growth after the age ring was formed (+ growth), 
# we filter to get lengths that correspond to age rings only
d <- d |> filter(age_ring == "Y")
#> filter: removed 28,878 rows (8%), 335,668 rows remaining

# age_bc is back-calculated age
# age_catch is age at catch

Plot data

All data

Code
ggplot(d, aes(age_bc, length_mm, color = area)) +
  geom_jitter(height = 0, alpha = 0.2) +
  scale_color_brewer(palette = "Paired")

Code

ggplot(d, aes(age_bc, length_mm, color = area)) +
  geom_jitter(height = 0, alpha = 0.2) + 
  guides(color = "none") + 
  scale_color_brewer(palette = "Paired") +
  facet_wrap(~ area)

Sample locations & length of time series

Code
sf::sf_use_s2(FALSE)
#> Spherical geometry (s2) switched off

# Read UTM function
LongLatToUTM <- function(x, y, zone){
  xy <- data.frame(ID = 1:length(x), X = x, Y = y)
  coordinates(xy) <- c("X", "Y")
  proj4string(xy) <- CRS("+proj=longlat +datum=WGS84")  ## for example
  res <- spTransform(xy, CRS(paste("+proj=utm +zone=",zone," ellps=WGS84",sep='')))
  return(as.data.frame(res))
}

# Specify ranges for big map
ymin = 52; ymax = 67; xmin = 11; xmax = 25

map_data <- rnaturalearth::ne_countries(
  scale = "medium",
  returnclass = "sf", continent = "europe")

# Crop the polygon for plotting and efficiency:
# st_bbox(map_data) # find the rough coordinates
swe_coast <- suppressWarnings(suppressMessages(
  st_crop(map_data,
          c(xmin = xmin, ymin = ymin, xmax = xmax, ymax = ymax))))

# Transform our map into UTM 33 coordinates, which is the equal-area projection we fit in:
utm_zone33 <- 32633
swe_coast_proj <- sf::st_transform(swe_coast, crs = utm_zone33)

# Add point to areas
sort(unique(d$area))
#>  [1] "BS"    "BT"    "FB"    "FM"    "HO"    "JM"    "MU"    "RA"    "SI_EK"
#> [10] "SI_HA" "TH"    "VN"

df <- data.frame(Area = c("Brunskar (BS)", "Biotest (BT)", "Finbo (FB)", "Forsmark (FM)",
                          "Holmon (HO)", "Kvadofjarden (JM)", "Musko (MU)", "Ranea (RA)",
                          "Simpevarp 1 (SI_EK", "Simpevarp 2 (SI_HA", "Torhamn (TH)", "Vino (VN)"),
                 lon = c(21.5, 18.1, 19.5, 18, 20.9, 16.8, 18.1, 22.3, 16.6, 16.7, 15.9, 16.9),
                 lat = c(60, 60.4, 60.3, 60.5, 63.7, 58, 59, 65.9, 57.3, 57.4, 56.1, 57.5))

# Add UTM coords
utm_coords <- LongLatToUTM(df$lon, df$lat, zone = 33)
df$X <- utm_coords$X
df$Y <- utm_coords$Y

# Crop the plot
xmin2 <- 330000; xmax2 <- 959000; xrange <- xmax - xmin
ymin2 <- 6000000; ymax2 <- 7300000; yrange <- ymax - ymin

ggplot(swe_coast_proj) +
  geom_sf() +
  coord_sf(xlim = c(xmin2, xmax2), ylim = c(ymin2, ymax2)) +
  geom_point(data = df, aes(x = X, y = Y, color = Area), size = 3) +
  labs(x = "Longitude", y = "Latitude") +
  scale_color_brewer(palette = "Paired")

Sample sizes

Code
# Average sample size by ID
d |> 
  group_by(cohort, area, ID) |> 
  summarise(n = n()) |> 
  ungroup() |> 
  ggplot(aes(x = n, y = area, n, fill = area)) +
  geom_density_ridges(stat = "binline", bins = 20, scale = 1, draw_baseline = FALSE, alpha = 0.8) +
  scale_fill_brewer(palette = "Paired") +
  guides(fill = "none")
#> group_by: 3 grouping variables (cohort, area, ID)
#> summarise: now 92,063 rows and 4 columns, 2 group variables remaining (cohort, area)
#> ungroup: no grouping variables

Code

# Average sample size by area, cohort and age
d |> 
  group_by(cohort, area, age_bc) |> 
  summarise(n = n()) |> 
  ungroup() |> 
  ggplot(aes(cohort, age_bc,color = n, fill = n)) +
  facet_wrap(~area) +
  geom_tile(shape = 21, alpha = 0.8) +
  scale_color_viridis() +
  scale_fill_viridis()
#> group_by: 3 grouping variables (cohort, area, age_bc)
#> summarise: now 4,274 rows and 4 columns, 2 group variables remaining (cohort, area)
#> ungroup: no grouping variables
#> Warning in geom_tile(shape = 21, alpha = 0.8): Ignoring unknown parameters:
#> `shape`

Code

# Sample size by gear (some overlapping gears with different names)
d |> 
  group_by(gear, area) |> 
  summarise(n = n()) |> 
  ggplot(aes(factor(gear), n, fill = area)) +
  geom_bar(stat = "identity") +
  guides(fill = "none") +
  facet_wrap(~area, scales = "free") +
  scale_fill_brewer(palette = "Paired") +
  theme(axis.text.x = element_text(angle = 90))
#> group_by: 2 grouping variables (gear, area)
#> summarise: now 37 rows and 3 columns, one group variable remaining (gear)

Code

# Plot sample size by area and cohort (all length-at-ages)
d |> 
  group_by(cohort, area) |> 
  summarise(n = n()) |> 
  ggplot(aes(cohort, n, fill = area)) +
  geom_bar(stat = "identity") + 
  scale_fill_brewer(palette = "Paired")
#> group_by: 2 grouping variables (cohort, area)
#> summarise: now 505 rows and 3 columns, one group variable remaining (cohort)

Code

d |> 
  group_by(cohort, area) |> 
  summarise(n = n()) |> 
  ggplot(aes(cohort, n, color = area)) +
  geom_line() + 
  scale_color_brewer(palette = "Paired") +
  facet_wrap(~area) +
  guides(color = "none") +
  theme(axis.text.x = element_text(angle = 90))
#> group_by: 2 grouping variables (cohort, area)
#> summarise: now 505 rows and 3 columns, one group variable remaining (cohort)

Code

# Plot sample size by area and catch_year
d |> 
  group_by(catch_year, area) |> 
  summarise(n = n()) |> 
  ggplot(aes(catch_year, n, fill = area)) +
  geom_bar(stat = "identity") + 
  scale_fill_brewer(palette = "Paired")
#> group_by: 2 grouping variables (catch_year, area)
#> summarise: now 372 rows and 3 columns, one group variable remaining (catch_year)

Code

d |> 
  group_by(catch_year, area) |> 
  summarise(n = n()) |> 
  ggplot(aes(catch_year, n, color = area)) +
  geom_line() + 
  scale_color_brewer(palette = "Paired") +
  facet_wrap(~area) +
  guides(color = "none") +
  theme(axis.text.x = element_text(angle = 90))
#> group_by: 2 grouping variables (catch_year, area)
#> summarise: now 372 rows and 3 columns, one group variable remaining (catch_year)

Time-slopes of length-at-age

Code
# Calculate time-slopes by age and area
time_slopes_by_year_area <- d |>
  group_by(age_bc, area) |> # center length at age for comparison across ages
  mutate(length_mm_ct = length_mm / mean(length_mm)) |> 
  ungroup() |> 
  mutate(id = paste(age_bc, area, sep = ";")) %>%
  split(.$id) |>
  purrr::map(~lm(length_mm_ct ~ catch_year, data = .x)) |>
  purrr::map_df(broom::tidy, .id = 'id') |>
  filter(term == 'catch_year') |> 
  separate(id, into = c("age_bc", "area"), sep = ";") |> 
  mutate(upr = estimate + std.error*2, lwr = estimate - std.error*2) |> 
  mutate(id = paste(age_bc, area, sep = ";"))
#> group_by: 2 grouping variables (age_bc, area)
#> mutate (grouped): new variable 'length_mm_ct' (double) with 37,708 unique values and 0% NA
#> ungroup: no grouping variables
#> mutate: new variable 'id' (character) with 167 unique values and 0% NA
#> filter: removed 167 rows (50%), 167 rows remaining
#> mutate: new variable 'upr' (double) with 154 unique values and 9% NA
#>         new variable 'lwr' (double) with 154 unique values and 9% NA
#> mutate: new variable 'id' (character) with 167 unique values and 0% NA

time_slopes_by_year_area
#> # A tibble: 167 × 10
#>    age_bc area  term      estimate std.error statistic   p.value     upr     lwr
#>    <chr>  <chr> <chr>        <dbl>     <dbl>     <dbl>     <dbl>   <dbl>   <dbl>
#>  1 1      BS    catch_ye…  0.00474 0.000451      10.5  1.53e- 25 0.00564 0.00384
#>  2 1      BT    catch_ye…  0.0137  0.000175      78.7  0         0.0141  0.0134 
#>  3 1      FB    catch_ye…  0.00444 0.0000989     44.9  0         0.00464 0.00424
#>  4 1      FM    catch_ye…  0.00943 0.0000843    112.   0         0.00960 0.00927
#>  5 1      HO    catch_ye…  0.00463 0.000186      24.9  6.21e-132 0.00501 0.00426
#>  6 1      JM    catch_ye…  0.00429 0.0000808     53.1  0         0.00445 0.00413
#>  7 1      MU    catch_ye…  0.00762 0.000489      15.6  2.50e- 53 0.00860 0.00665
#>  8 1      RA    catch_ye…  0.00285 0.000636       4.48 7.93e-  6 0.00412 0.00158
#>  9 1      SI_EK catch_ye…  0.00581 0.000171      34.0  4.94e-231 0.00616 0.00547
#> 10 1      SI_HA catch_ye…  0.00480 0.000148      32.5  9.65e-217 0.00510 0.00451
#> # ℹ 157 more rows
#> # ℹ 1 more variable: id <chr>

# Add sample size so that we can filter on that
sample_size <- d |> 
  group_by(age_bc, area) |> 
  summarise(n = n()) |> 
  ungroup() |> 
  mutate(id = paste(age_bc, area, sep = ";")) |> 
  dplyr::select(n, id)
#> group_by: 2 grouping variables (age_bc, area)
#> summarise: now 167 rows and 3 columns, one group variable remaining (age_bc)
#> ungroup: no grouping variables
#> mutate: new variable 'id' (character) with 167 unique values and 0% NA

# Join sample size
time_slopes_by_year_area <- left_join(time_slopes_by_year_area, sample_size)
#> Joining with `by = join_by(id)`
#> left_join: added one column (n)
#>            > rows only in x     0
#>            > rows only in y  (  0)
#>            > matched rows     167
#>            >                 =====
#>            > rows total       167

# Plot effect sizes
time_slopes_by_year_area |>
  filter(n > 30) |> 
  ggplot(aes(reorder(age_bc, as.numeric(age_bc)), estimate, color = factor(area))) + 
  geom_point(position = position_dodge(width = 0.4)) +
  geom_errorbar(aes(ymin = lwr, ymax = upr), width = 0.2,
                position = position_dodge(width = 0.4)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_color_brewer(palette = "Paired") + 
  facet_wrap(~factor(area), scales = "free") + 
  labs(x = "Age", y = "slope: size~time") +
  theme(legend.position = "bottom")
#> filter: removed 53 rows (32%), 114 rows remaining

Code

time_slopes_by_year_area |>
  mutate(age_bc=as.numeric(age_bc)) |>
  filter(n > 30) |> 
  ggplot(aes(area,estimate,color = factor(age_bc))) + 
  geom_point(position = position_dodge(width = 0.4)) +
  geom_errorbar(aes(ymin = lwr, ymax = upr), width = 0.2,
                position = position_dodge(width = 0.4)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_color_brewer(palette = "Paired") + 
  facet_wrap(~factor(age_bc), scales = "free") + 
  labs(x = "Age", y = "slope: size~time") +
  theme(legend.position = "bottom")
#> mutate: converted 'age_bc' from character to double (0 new NA)
#> filter: removed 53 rows (32%), 114 rows remaining
#> Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Paired is 12
#> Returning the palette you asked for with that many colors
#> Warning: Removed 1 rows containing missing values (`geom_point()`).

Code

time_slopes_by_year_area |>
  filter(n > 30) |> 
  ggplot(aes(reorder(age_bc, as.numeric(age_bc)), estimate, color = factor(area))) + 
  geom_point(position = position_dodge(width = 0.4)) +
  geom_errorbar(aes(ymin = lwr, ymax = upr), width = 0.2,
                position = position_dodge(width = 0.4)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_color_brewer(palette = "Paired") + 
  labs(x = "Age", y = "slope: size~time")
#> filter: removed 53 rows (32%), 114 rows remaining

Code
  
time_slopes_by_year_area |>
  filter(n > 30) |> 
  ggplot(aes(as.numeric(age_bc), estimate)) + 
  geom_point(position = position_dodge(width = 0.4)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  stat_smooth(method = "gam", formula = y ~ s(x, k = 3)) +
  labs(x = "Age", y = "slope: size~time")
#> filter: removed 53 rows (32%), 114 rows remaining